rm(list = ls())
# Load libraries
library(tidyverse)
library(data.table)
library(janitor)
library(leaflet)
library(ggplot2)
# Read in the data using data.table().
pm2004 <- data.table::fread("ad_viz_plotval_data_2004.csv")
pm2004 <- clean_names(pm2004)
pm2019 <- data.table::fread("ad_viz_plotval_data_2019.csv")
pm2019 <- clean_names(pm2019)
# Check dimensions
paste("The 2004 dataset has", dim(pm2004)[1], "observations and", dim(pm2004)[2], "variables.")
## [1] "The 2004 dataset has 19233 observations and 20 variables."
paste("The 2019 dataset has", dim(pm2019)[1], "observations and", dim(pm2019)[2], "variables.")
## [1] "The 2019 dataset has 53086 observations and 20 variables."
# Check headers
head(pm2004)
## date source site_id poc daily_mean_pm2_5_concentration units
## 1: 01/01/2004 AQS 60010007 1 11.0 ug/m3 LC
## 2: 01/02/2004 AQS 60010007 1 12.2 ug/m3 LC
## 3: 01/03/2004 AQS 60010007 1 16.5 ug/m3 LC
## 4: 01/04/2004 AQS 60010007 1 19.5 ug/m3 LC
## 5: 01/05/2004 AQS 60010007 1 11.5 ug/m3 LC
## 6: 01/06/2004 AQS 60010007 1 32.5 ug/m3 LC
## daily_aqi_value site_name daily_obs_count percent_complete
## 1: 46 Livermore 1 100
## 2: 51 Livermore 1 100
## 3: 60 Livermore 1 100
## 4: 67 Livermore 1 100
## 5: 48 Livermore 1 100
## 6: 94 Livermore 1 100
## aqs_parameter_code aqs_parameter_desc cbsa_code
## 1: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 2: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 3: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 4: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 5: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 6: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## cbsa_name state_code state county_code county
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## site_latitude site_longitude
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
head(pm2019)
## date source site_id poc daily_mean_pm2_5_concentration units
## 1: 01/01/2019 AQS 60010007 3 5.7 ug/m3 LC
## 2: 01/02/2019 AQS 60010007 3 11.9 ug/m3 LC
## 3: 01/03/2019 AQS 60010007 3 20.1 ug/m3 LC
## 4: 01/04/2019 AQS 60010007 3 28.8 ug/m3 LC
## 5: 01/05/2019 AQS 60010007 3 11.2 ug/m3 LC
## 6: 01/06/2019 AQS 60010007 3 2.7 ug/m3 LC
## daily_aqi_value site_name daily_obs_count percent_complete
## 1: 24 Livermore 1 100
## 2: 50 Livermore 1 100
## 3: 68 Livermore 1 100
## 4: 86 Livermore 1 100
## 5: 47 Livermore 1 100
## 6: 11 Livermore 1 100
## aqs_parameter_code aqs_parameter_desc cbsa_code
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88101 PM2.5 - Local Conditions 41860
## 3: 88101 PM2.5 - Local Conditions 41860
## 4: 88101 PM2.5 - Local Conditions 41860
## 5: 88101 PM2.5 - Local Conditions 41860
## 6: 88101 PM2.5 - Local Conditions 41860
## cbsa_name state_code state county_code county
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## site_latitude site_longitude
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
# Check footers
tail(pm2004)
## date source site_id poc daily_mean_pm2_5_concentration units
## 1: 12/14/2004 AQS 61131003 1 11 ug/m3 LC
## 2: 12/17/2004 AQS 61131003 1 16 ug/m3 LC
## 3: 12/20/2004 AQS 61131003 1 17 ug/m3 LC
## 4: 12/23/2004 AQS 61131003 1 9 ug/m3 LC
## 5: 12/26/2004 AQS 61131003 1 24 ug/m3 LC
## 6: 12/29/2004 AQS 61131003 1 9 ug/m3 LC
## daily_aqi_value site_name daily_obs_count percent_complete
## 1: 46 Woodland-Gibson Road 1 100
## 2: 59 Woodland-Gibson Road 1 100
## 3: 61 Woodland-Gibson Road 1 100
## 4: 38 Woodland-Gibson Road 1 100
## 5: 76 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## aqs_parameter_code aqs_parameter_desc cbsa_code
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## cbsa_name state_code state county_code
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## county site_latitude site_longitude
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
tail(pm2019)
## date source site_id poc daily_mean_pm2_5_concentration units
## 1: 11/11/2019 AQS 61131003 1 13.5 ug/m3 LC
## 2: 11/17/2019 AQS 61131003 1 18.1 ug/m3 LC
## 3: 11/29/2019 AQS 61131003 1 12.5 ug/m3 LC
## 4: 12/17/2019 AQS 61131003 1 23.8 ug/m3 LC
## 5: 12/23/2019 AQS 61131003 1 1.0 ug/m3 LC
## 6: 12/29/2019 AQS 61131003 1 9.1 ug/m3 LC
## daily_aqi_value site_name daily_obs_count percent_complete
## 1: 54 Woodland-Gibson Road 1 100
## 2: 64 Woodland-Gibson Road 1 100
## 3: 52 Woodland-Gibson Road 1 100
## 4: 76 Woodland-Gibson Road 1 100
## 5: 4 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## aqs_parameter_code aqs_parameter_desc cbsa_code
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## cbsa_name state_code state county_code
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## county site_latitude site_longitude
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
# Check variable names
paste(colnames(pm2004))
## [1] "date" "source"
## [3] "site_id" "poc"
## [5] "daily_mean_pm2_5_concentration" "units"
## [7] "daily_aqi_value" "site_name"
## [9] "daily_obs_count" "percent_complete"
## [11] "aqs_parameter_code" "aqs_parameter_desc"
## [13] "cbsa_code" "cbsa_name"
## [15] "state_code" "state"
## [17] "county_code" "county"
## [19] "site_latitude" "site_longitude"
# Check variable types
sapply(pm2004, typeof)
## date source
## "character" "character"
## site_id poc
## "integer" "integer"
## daily_mean_pm2_5_concentration units
## "double" "character"
## daily_aqi_value site_name
## "integer" "character"
## daily_obs_count percent_complete
## "integer" "double"
## aqs_parameter_code aqs_parameter_desc
## "integer" "character"
## cbsa_code cbsa_name
## "integer" "character"
## state_code state
## "integer" "character"
## county_code county
## "integer" "character"
## site_latitude site_longitude
## "double" "double"
# Check for issues in key variables:
str(pm2004) # General readout
## Classes 'data.table' and 'data.frame': 19233 obs. of 20 variables:
## $ date : chr "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
## $ source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ site_id : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ poc : int 1 1 1 1 1 1 1 1 1 1 ...
## $ daily_mean_pm2_5_concentration: num 11 12.2 16.5 19.5 11.5 32.5 15.5 29.9 21 16.9 ...
## $ units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ daily_aqi_value : int 46 51 60 67 48 94 58 88 70 61 ...
## $ site_name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ daily_obs_count : int 1 1 1 1 1 1 1 1 1 1 ...
## $ percent_complete : num 100 100 100 100 100 100 100 100 100 100 ...
## $ aqs_parameter_code : int 88502 88502 88502 88502 88502 88502 88502 88502 88502 88502 ...
## $ aqs_parameter_desc : chr "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" ...
## $ cbsa_code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ cbsa_name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ state_code : int 6 6 6 6 6 6 6 6 6 6 ...
## $ state : chr "California" "California" "California" "California" ...
## $ county_code : int 1 1 1 1 1 1 1 1 1 1 ...
## $ county : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ site_latitude : num 37.7 37.7 37.7 37.7 37.7 ...
## $ site_longitude : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(pm2004) #Summary statistics
## date source site_id poc
## Length:19233 Length:19233 Min. :60010007 Min. : 1.000
## Class :character Class :character 1st Qu.:60370002 1st Qu.: 1.000
## Mode :character Mode :character Median :60658001 Median : 1.000
## Mean :60588026 Mean : 1.816
## 3rd Qu.:60750006 3rd Qu.: 2.000
## Max. :61131003 Max. :12.000
##
## daily_mean_pm2_5_concentration units daily_aqi_value
## Min. : -0.10 Length:19233 Min. : 0.00
## 1st Qu.: 6.00 Class :character 1st Qu.: 25.00
## Median : 10.10 Mode :character Median : 42.00
## Mean : 13.13 Mean : 46.33
## 3rd Qu.: 16.30 3rd Qu.: 60.00
## Max. :251.00 Max. :301.00
##
## site_name daily_obs_count percent_complete aqs_parameter_code
## Length:19233 Min. :1 Min. :100 Min. :88101
## Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
## Mode :character Median :1 Median :100 Median :88101
## Mean :1 Mean :100 Mean :88267
## 3rd Qu.:1 3rd Qu.:100 3rd Qu.:88502
## Max. :1 Max. :100 Max. :88502
##
## aqs_parameter_desc cbsa_code cbsa_name state_code
## Length:19233 Min. :12540 Length:19233 Min. :6
## Class :character 1st Qu.:31080 Class :character 1st Qu.:6
## Mode :character Median :40140 Mode :character Median :6
## Mean :35328 Mean :6
## 3rd Qu.:41860 3rd Qu.:6
## Max. :49700 Max. :6
## NA's :1253
## state county_code county site_latitude
## Length:19233 Min. : 1.00 Length:19233 Min. :32.63
## Class :character 1st Qu.: 37.00 Class :character 1st Qu.:34.07
## Mode :character Median : 65.00 Mode :character Median :36.48
## Mean : 58.63 Mean :36.23
## 3rd Qu.: 75.00 3rd Qu.:38.10
## Max. :113.00 Max. :41.71
##
## site_longitude
## Min. :-124.2
## 1st Qu.:-121.6
## Median :-119.3
## Mean :-119.7
## 3rd Qu.:-117.9
## Max. :-115.5
##
summary(pm2004$daily_mean_pm2_5_concentration) #Key variable
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.10 6.00 10.10 13.13 16.30 251.00
mean(is.na(pm2004$daily_mean_pm2_5_concentration)) #No missing values
## [1] 0
hist(pm2004$daily_mean_pm2_5_concentration, breaks = 100, xlim=c(0,100)) #Check for outliers
str(pm2019) # General readout
## Classes 'data.table' and 'data.frame': 53086 obs. of 20 variables:
## $ date : chr "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
## $ source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ site_id : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ poc : int 3 3 3 3 3 3 3 3 3 3 ...
## $ daily_mean_pm2_5_concentration: num 5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
## $ units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ daily_aqi_value : int 24 50 68 86 47 11 12 29 13 30 ...
## $ site_name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ daily_obs_count : int 1 1 1 1 1 1 1 1 1 1 ...
## $ percent_complete : num 100 100 100 100 100 100 100 100 100 100 ...
## $ aqs_parameter_code : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
## $ aqs_parameter_desc : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
## $ cbsa_code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ cbsa_name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ state_code : int 6 6 6 6 6 6 6 6 6 6 ...
## $ state : chr "California" "California" "California" "California" ...
## $ county_code : int 1 1 1 1 1 1 1 1 1 1 ...
## $ county : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ site_latitude : num 37.7 37.7 37.7 37.7 37.7 ...
## $ site_longitude : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(pm2019) #Summary statistics
## date source site_id poc
## Length:53086 Length:53086 Min. :60010007 Min. : 1.000
## Class :character Class :character 1st Qu.:60310004 1st Qu.: 1.000
## Mode :character Mode :character Median :60612003 Median : 3.000
## Mean :60565291 Mean : 2.562
## 3rd Qu.:60771002 3rd Qu.: 3.000
## Max. :61131003 Max. :21.000
##
## daily_mean_pm2_5_concentration units daily_aqi_value
## Min. : -2.200 Length:53086 Min. : 0.00
## 1st Qu.: 4.000 Class :character 1st Qu.: 17.00
## Median : 6.500 Mode :character Median : 27.00
## Mean : 7.734 Mean : 30.56
## 3rd Qu.: 9.900 3rd Qu.: 41.00
## Max. :120.900 Max. :185.00
##
## site_name daily_obs_count percent_complete aqs_parameter_code
## Length:53086 Min. :1 Min. :100 Min. :88101
## Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
## Mode :character Median :1 Median :100 Median :88101
## Mean :1 Mean :100 Mean :88214
## 3rd Qu.:1 3rd Qu.:100 3rd Qu.:88502
## Max. :1 Max. :100 Max. :88502
##
## aqs_parameter_desc cbsa_code cbsa_name state_code
## Length:53086 Min. :12540 Length:53086 Min. :6
## Class :character 1st Qu.:31080 Class :character 1st Qu.:6
## Mode :character Median :40140 Mode :character Median :6
## Mean :35841 Mean :6
## 3rd Qu.:41860 3rd Qu.:6
## Max. :49700 Max. :6
## NA's :4181
## state county_code county site_latitude
## Length:53086 Min. : 1.00 Length:53086 Min. :32.58
## Class :character 1st Qu.: 31.00 Class :character 1st Qu.:34.14
## Mode :character Median : 61.00 Mode :character Median :36.63
## Mean : 56.39 Mean :36.35
## 3rd Qu.: 77.00 3rd Qu.:37.97
## Max. :113.00 Max. :41.76
##
## site_longitude
## Min. :-124.2
## 1st Qu.:-121.6
## Median :-119.8
## Mean :-119.8
## 3rd Qu.:-118.1
## Max. :-115.5
##
summary(pm2019$daily_mean_pm2_5_concentration) #Key variable
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.200 4.000 6.500 7.734 9.900 120.900
mean(is.na(pm2019$daily_mean_pm2_5_concentration)) #No missing values
## [1] 0
hist(pm2019$daily_mean_pm2_5_concentration, breaks = 100, xlim=c(0,100)) #Check for outliers
Summary of findings: While the Max values of PM 2.5 in each of these data sets is extremely high, they don’t seem to be errors - just extreme events.
# Combine the two years of data into one data frame.
pm <- rbind(pm2004, pm2019)
# Use the Date variable to create a new column for year, which will serve as an identifier.
dates <- pm$date
dates2 <- as.POSIXct(dates, format = "%m/%d/%Y")
dates3 <- format(dates2, format="%Y")
pm <- mutate(pm, year = dates3) #New column "year" added
# Change the names of the key variables so that they are easier to refer to in your code.
pm <- rename(pm, pm25 = daily_mean_pm2_5_concentration)
pm$year <- as.numeric(pm$year) #so that colorNumeric works, but can instead use colorFactor
# Create a basic map in leaflet() that shows the locations of the sites (make sure to use different colors for each year).
pal <- colorNumeric(palette = "RdYlBu", domain=c(2004, 2019))
#colorNumeric doesn't work any more
leaflet(pm) %>%
addProviderTiles('OpenStreetMap') %>%
addCircles(lat=~site_latitude,lng=~site_longitude, color=pal(pm$year), opacity=1, fillOpacity=1, radius=100)
#addLegend('bottomleft', pal=pal, values=~c(2004, 2019),
#title='Site Locations', opacity=1)
#this legend is driving me crazy
Summarize the spatial distribution of the monitoring sites: ???
State County Site in Los Angeles